## This script cleans the data and runs the analysis for Kenya ##
# preamble and load data #### 
## load packages ##

rm(list=ls())


library(foreign)
library(car)
library(ggplot2)
library(plyr)
library(Rmisc)
library(grid)
library(stargazer)
library(lmtest)
library(sandwich)
library(multiwayvcov)
library(purrr)
library(dotwhisker)
library(broom)
library(gtools)
library(stringr)
library(DataCombine)
library(tidyr)
library(tidyverse)
library(haven)
library(survey)
library(data.table)
library(xtable)
library(sf)

# set working directory
setwd("~/Dropbox/facebook sampling/replication/kenya replication")
data_exports<-"../data_exports/"

## define a few functions ##

my.mean <- function(x){mean(x,na.rm=T)}
my.se <- function(x){sd(x,na.rm=T)/sqrt(length(na.omit(x)))}


### set seed ###
set.seed(60605)

## load 3 datasets ##

# facebook survey # 
fb <- read.csv("data sets/kenya_facebook.csv")

# afrobarometer 2019 (round 8) #
# download from: https://www.afrobarometer.org/survey-resource/kenya-round-8-data-2019/
ab <- read_sav("data sets/ken_r8.data_.new_.final_.wtd_release.31mar21.sav")

# census #
## if needed, uncomment line below to install rKenyaCensus package (https://shelkariuki.netlify.app/blog/rkenyacensus/ )
# install.packages("devtools")
# devtools::install_github("Shelmith-Kariuki/rKenyaCensus")
library(rKenyaCensus)
data(DataCatalogue)



# clean data sets #### 
# relevel edu variable in fb - clean all leading/trailing spaces in character vars
fb <- fb %>%
  mutate(edu = str_trim(edu, side = "right"))

fb$edu_relevel <- factor(fb$edu, levels = c("No formal schooling","Informal schooling only (including Koranic schooling)",
                                            "Some primary schooling","Primary school completed",
                                            "Intermediate school or Some secondary school / high school",
                                            "Secondary school / high school completed",
                                            "Post-secondary qualifications, other than university e.g. a diploma or degree from a polytechnic or college",
                                            "Some university","University completed","Post-graduate"))
fb$edu_num <- as.numeric(fb$edu_relevel)


# prepare data for weighting
# define n of fb sample
fbN <-nrow(fb) #1528

## get population numbers --> frequencies of >18 pop
# 18-29
# 30-49
# 50-59
# 60+

## age ##
total18pop <- V3_T2.2 %>% 
  filter(
    !Age %in% c("Total","0-4","5-9","10-14","15-19","20-24","25-29",
                "30-34","35-39","40-44","45-49","50-54","55-59","60-64",
                "65-69","70-74","75-79","80-84","85-89","90-94","95-99","NotStated")
  ) %>% 
  mutate(Age=dplyr::recode(Age, "100+"="100"),
         Age = as.numeric(Age)) %>%
  summarise(
    total18 = sum(Total[Age>=18] - 687), # subtract not stated ages from total
    male = sum(Male[Age>=18]),
    female = sum(Female[Age>=18]),
    age1829 = sum(Total[Age>=18 & Age<=29]),
    age3049 = sum(Total[Age>=30 & Age<=49]),
    age5059 = sum(Total[Age>=50 & Age<=59]),
    age60 = sum(Total[Age>=60])
  )

# calculate proportion of pop * fb sample size 
total18plus <- total18pop[1]

pop.prop <- (total18pop/as.numeric(total18plus))
pop.prop.fbn <- (total18pop/as.numeric(total18plus))*fbN


# urban pop > 18yo
urban <- V3_T2.2b %>% filter(Age!="Total",
                             Age!="0 - 4", 
                             Age!="5-9",
                             Age!="10-14",
                             Age!="15-19",
                             Age!="20-24",
                             Age!="25-29",
                             Age!="30-34",
                             Age!="35-39",
                             Age!="40-44",
                             Age!="45-49",
                             Age!="50-54",
                             Age!="55-59",
                             Age!="60-64",
                             Age!="65-69",
                             Age!="70-74",
                             Age!="75-79",
                             Age!="80-84",
                             Age!="85-89",
                             Age!="90-94",
                             Age!="95-99",
                             Age!="NotStated",
                             Age!="Not Stated") %>%
  mutate(Age=dplyr::recode(Age, "100+"="100"),
         Age = as.numeric(Age))%>% 
  summarise(
    urban = sum(Total[Age>=18])
  )                                   

urban.prop <- urban/total18plus
rural.prop <- 1-urban.prop



## tribe ##
# want % luo and % kikuyu
# retrieved from file: "census_tables_cleaned_VOLUME IV KHPC 2019.xls" (tab 2.31)

luo <- 5066966/47067376
kikuyu <- 8148668/47067376

tribe.prop <- cbind(luo, kikuyu)

## religion ##
# want % Christian and % Muslim and % No religion
# retrieved from file: "census_tables_cleaned_VOLUME IV KHPC 2019.xls" (tab 2.30)

totalrel <- 47213282 - 73253 - 6909 # total - dont knows - not stated

christian <- (9726169 +	15777473 + 9648690 + 3292573 + 201263 + 1732911)/totalrel
muslim <- 5152194/totalrel
norel <- 755750/totalrel

religion.prop <- cbind(christian, muslim, norel)


## education ##

#View(V4_T2.5) # Distribution of Population Aged 3 Years and Above by Highest Level of Education Completed

# CATEGORIES: “Didn’t complete secondary school,” “Secondary degree,” and “College degree or higher"

totaledu <- 36212477 - (101326 + 9132) # minus DK / not stated 
nonsec <- 3616843 + 18341098 + 51996 # preprimary + primary + madrasa
secondary <- 9787320 + 2755273 #includes technical
collegeplus <- 1511943

edu.prop <- cbind(nonsec,secondary,collegeplus)/totaledu



## create tables of frequencies for each variable ##

pop.all <- cbind(pop.prop[-1],urban.prop, religion.prop, tribe.prop, edu.prop)  


# census table of demographics #

dem.cens <- pop.all[c(2:15)]
names(dem.cens)[2:5] <- c("age1","age2","age3","age4")
dem.cens <- as.data.frame(t(dem.cens))

names(dem.cens)[1] <- "mean"
dem.cens$variable <- rownames(dem.cens)
dem.cens$se <- NA
dem.cens$ci.low <- NA
dem.cens$ci.high <- NA
dem.cens$sample <- "Census"


## create frequencies for census categories for weighting ##
sex_dist <- tibble(sex = c("Male", "Female"), Freq = c(round((1-pop.prop$female)*fbN),round(pop.prop$female*fbN)))

age_dist4 <- setDT(round(as.data.frame(t(pop.prop[4:7])*fbN)), keep.rownames = TRUE)[]
names(age_dist4) <- c("agegroup4","Freq")

urbrur_dist <- setDT(as.data.frame(t(cbind(round(urban.prop*fbN),round(rural.prop*fbN)))), keep.rownames = TRUE)[]
names(urbrur_dist) <- c("urbrur","Freq")
urbrur_dist$urbrur[2] <- "rural"

edu_dist <- setDT(round(as.data.frame(t(edu.prop*fbN))), keep.rownames = TRUE)[]
names(edu_dist) <- c("edu3","Freq")



## RUN RAKING ##
## create weights for fb data ##

## sex
summary(factor(fb$sex))

## age
fb <- fb %>% 
  mutate(
    #agegroup2 = ifelse(age<32,"under32","over32"),
    agegroup4 = ifelse(age<30,"age1829",
                       ifelse(age>=30 & age<50, "age3049",
                              ifelse(age>=50 & age<60, "age5059","age60")))
  )

## define urbrur variable in survey data ##
table(fb$NEW_rururb_self)

# do most ppl who live 20-30 min from town say it's rural or urban?
table(fb[fb$ride_town=="20-30 minutes",]$NEW_rururb_self)
# 73 = rural, 41 = urban

# same coding previously
table(fb$ride_town)
fb <- fb %>% 
  filter(ride_town!="-99") %>% 
  mutate(
    urbrur = ifelse(ride_town %in% c("20-30 minutes","30-60 minutes","more than 60 minutes"), "rural","urban") 
  )

table(fb$urbrur)

## education -- 3 categories: less than sec, secondary/technical, university+

fb <- fb %>% mutate(
  edu_3 = ifelse(edu_relevel %in% c("No formal schooling","Informal schooling only (including Koranic schooling)","Some primary schooling","Primary school completed","Intermediate school or Some secondary school / high school"), "nonsec",
                 ifelse(edu_relevel %in% c("Secondary school / high school completed","Post-secondary qualifications, other than university e.g. a diploma or degree from a polytechnic or college","Some university"), "secondary","collegeplus")
  )
)

fb$edu3 <- factor(fb$edu_3, levels = c("nonsec","secondary","collegeplus"))


## create survey design object with no clusters and unit weights, from survey sample
fb_unweighted <- svydesign(ids=~0,data=fb)

## create raked weights
rake_agesexgeog <- rake(design = fb_unweighted, ## survey design object from above
                        sample.margins = list(~sex, ~urbrur, ~edu3, ~agegroup4), #, ## names of weighting variables in dataset
                        population.margins = list(sex_dist, urbrur_dist, edu_dist, age_dist4))


# add weights to fb dataset
fb$weight<-weights(rake_agesexgeog)

## examine weights
plot(density(weights(rake_agesexgeog)))

quantile(weights(rake_agesexgeog),probs=c(.05,.5,.95))

length(weights(rake_agesexgeog)[weights(rake_agesexgeog)>quantile(weights(rake_agesexgeog),probs=c(.95))])
## 71 people with weights above the 95 percentile

# trim weights
rake.trim <- trimWeights(rake_agesexgeog, lower=-Inf, #quantile(weights(rake_agesexgeog),probs=c(.05)),
                         upper=quantile(weights(rake_agesexgeog),probs=c(.95)),strict=TRUE)

fb$weight_trimmed<-weights(rake.trim)
# min = .6674, max =  2.456, mean = 0.897

## Save survey data set with weights ##
write_csv(fb,file="data sets/kenya_facebook_cleaned.csv")


#      Creating data table for DEMOGRAPHIC figure (FIGURE 2) #### 
# define demographic variables of interest
dem.vars <- c("female","age1","age2","age3","age4", # ages1-4: "18-29","30-49","50-59","60+",
              "christian","muslim","norel",
              "nonsec","secondary", "collegeplus",
              "luo","kikuyu",
              "urban")

## Facebook survey Data ####

fb$female <- ifelse(fb$sex == "Female",1,0)

fb$age1 <- ifelse(fb$age %in% c(18:29),1,0)
fb$age2 <- ifelse(fb$age %in% c(30:49),1,0)
fb$age3 <- ifelse(fb$age %in% c(50:59),1,0)
fb$age4 <- ifelse(fb$age >= 60,1,0)

fb$christian <- ifelse(fb$religion == "Christian",1,
                       ifelse(fb$religion %in% c(-99),NA,0))
fb$muslim <- ifelse(fb$religion == "Muslim",1,
                    ifelse(fb$religion %in% c(-99),NA,0))
fb$norel <- ifelse(fb$religion == "None",1,
                   ifelse(fb$religion %in% c(-99),NA,0))

fb$nonsec <- ifelse(fb$edu3 == "nonsec",1,0)
fb$secondary <- ifelse(fb$edu3 == "secondary",1,0)
fb$collegeplus <- ifelse(fb$edu3 == "collegeplus",1,0)

fb$luo <- ifelse(fb$tribe == "Luo",1,
                 ifelse(fb$tribe == -99,NA,0))
fb$kikuyu <- ifelse(fb$tribe == "Kikuyu",1,
                    ifelse(fb$tribe == -99,NA,0))

fb$urban <- ifelse(fb$urbrur=="urban",1,0)

# function to get means/std devs (raw and weighted) for demographic variables
fb.dems <- fb %>% 
  select(all_of(dem.vars)) %>% 
  map_df(~ tibble(
    mean = my.mean(.x), 
    se = my.se(.x), 
    ci.low = mean - 1.96*se,
    ci.high = mean + 1.96*se
  ), .id = "variable")

fb.dems$sample <- "Facebook Sample"


# function to calculate weighted mean and standard error (same as indonesia function):

calculate_weighted_stats <- function(data, var_names, weight_col, ids = ~1) {
  # Create the survey design
  design <- svydesign(ids = ids, data = data, weights = data[[weight_col]])
  
  # Initialize an empty data frame to store results
  results <- tibble(variable = character(), weighted_mean = numeric(), weighted_se = numeric(),
                    ci.low = numeric(), ci.high = numeric())
  
  # Loop through each variable and calculate weighted mean and SE
  for (var in var_names) {
    # Calculate weighted mean and SE
    mean_se <- svymean(as.formula(paste0("~", var)), design, na.rm = TRUE)
    
    # Extract results and add to the data frame
    results <- results %>% 
      add_row(variable = var, 
              weighted_mean = coef(mean_se),
              weighted_se = SE(mean_se),
              ci.low = weighted_mean - 1.96*weighted_se,
              ci.high = weighted_mean + 1.96*weighted_se)
  }
  names(results) <- c("variable","mean","se","ci.low","ci.high")
  return(results)
}

fb.dems.weighted <- calculate_weighted_stats(fb, dem.vars, "weight_trimmed")
fb.dems.weighted$sample <- "Facebook Sample (weighted)"

fb.dems.all <- rbind(fb.dems, fb.dems.weighted)


##  Afrobarometer Data  ####

# female
ab$female <- ifelse(ab$Q101==2,1,0)

# 18-29, 30-49, 50-59, 60+
ab$age <- as.numeric(ab$Q1)
ab$age1 <- ifelse(ab$age %in% c(18:29),1,0)
ab$age2 <- ifelse(ab$age %in% c(30:49),1,0)
ab$age3 <- ifelse(ab$age %in% c(50:59),1,0)
ab$age4 <- ifelse(ab$age >= 60,1,0)

# christian 
ab$christian <- ifelse(ab$Q98A %in% c(1:17,30:33),1,
                       ifelse(ab$Q98A %in% c(9995,9998,9999,-1),NA,0))
# muslim
ab$muslim <- ifelse(ab$Q98A %in% c(18:24),1,
                    ifelse(ab$Q98A %in% c(9995,9998,9999,-1),NA,0))
# no religion
ab$norel <- ifelse(ab$Q98A == 0,1,
                   ifelse(ab$Q98A %in% c(9995,9998,9999,-1),NA,0))

# education - 3 categories
# some - but not completed coded in prior category (e.g. some secondary included in nonsec, some college included in secondary)
ab$edu <- ifelse(ab$Q97 %in% c(-1,98), NA, ab$Q97)
ab$nonsec <- ifelse(ab$edu<5,1,0)
ab$secondary <- ifelse(ab$edu %in% c(5:7),1,0)
ab$collegeplus <- ifelse(ab$edu>7,1,0)

# luo
ab$luo <- ifelse(ab$Q81 == 301,1,
                 ifelse(ab$Q81 > 9990, NA,0))
# kikuyu
ab$kikuyu <- ifelse(ab$Q81 == 300,1,
                    ifelse(ab$Q81 > 9990, NA,0))
# urban
ab$urban <- ifelse(ab$URBRUR==1,1,0)

# calculate ab demographic mean/se
ab.dems <- ab %>% 
  select(all_of(dem.vars)) %>% 
  map_df(~ tibble(
    mean = my.mean(.x), 
    se = my.se(.x), 
    ci.low = mean - 1.96*se,
    ci.high = mean + 1.96*se
  ), .id = "variable")

ab.dems$sample <- "AB Survey"


##  calculate weighted average for Afrobarometer data ##
AB_weighted <- calculate_weighted_stats(data = ab,var_names = dem.vars,weight_col = "withinwt_hh", ids = ~1)
AB_weighted$sample <- "AB (weighted)"


all.dems <- bind_rows(dem.cens, as.data.frame(fb.dems.all), as.data.frame(ab.dems), as.data.frame(AB_weighted))
rownames(all.dems) <- NULL

## output data for figure 2: demographic comparisons ####
write.csv(all.dems,file=paste0(data_exports,"fig2_demos_kenya.csv"))

## SEE PYTHON CODE FOR TURNING TABLE INTO FIGURE ##

## ACCURACY OF FB TARGETING: TABLE 2 ####

# calculate % of people who reported same/"correct" demographic as FB assigned them (identified by the ad strata they entered from)

## ads by gender
fb <- fb %>% 
  mutate(
    femalead = ifelse(grepl("_f",location),1,0),
    malead = ifelse(grepl("_m",location),1,0),
    sexadright = ifelse(grepl("_f",location) & sex=="Female" |
                          grepl("_m",location) & sex=="Male",
                        1,0)
  )

sum(fb$sexadright)/sum(fb$femalead==1 | fb$malead==1) 
# 91% of respondents recruited from male/female ads self report the gender of the ad target

## ads by age 
fb <- fb %>% 
  mutate(
    over32ad = ifelse(grepl("old",location),1,0),
    oldadright = ifelse(grepl("old",location) & age>=32,1,0)
  )

oldad <- fb[fb$over32ad==1,]
table(oldad$oldadright)/nrow(oldad)
# 44% of old ads correctly got ppl >=32 yo

## ads by education (not specified)
fb <- fb %>% 
  mutate(
    unspeceduad = ifelse(grepl("eduna",location),1,0),
    eduadright = ifelse(grepl("eduna",location) & edu_num <= 5,1,0)
  )
eduad <- fb[fb$unspeceduad==1,]
table(eduad$eduadright)/nrow(eduad)
# 12% of respondents who came from unspecified education ad had some secondary or less education (most had more)

## ads by location ##
# import shapefiles for spatial data #
counties_shp <- st_read('gis/kenya_counties/County.shp')
regions_shp  <- st_read('gis/kenya_region_shapefile/kenya_region_shapefile.shp')

# spatial join to know which counties in which regions
counties_shp <- st_transform(counties_shp, st_crs(regions_shp))

# Perform a spatial intersection to get overlapping geometries between counties and regions
intersection <- st_intersection(counties_shp, regions_shp)

# Calculate the area of each intersection (part of county that overlaps with a region)
intersection <- intersection %>%
  mutate(intersection_area = st_area(.))

# Group by county and region to find the largest intersection area per county
county_region_areas <- intersection %>%
  group_by(county_id = COUNTY, region_name = name_1) %>%  
  summarize(total_intersection_area = sum(intersection_area), .groups = "drop")

# For each county, select the region with the largest intersection area
counties_with_regions <- county_region_areas %>%
  group_by(county_id) %>%
  slice_max(order_by = total_intersection_area, n = 1) %>%
  ungroup()

# this is county - region pairs
county_region <- as.data.frame(counties_with_regions)[,c("county_id","region_name")]

# clean county names
county_region <- county_region %>%
  mutate(county_id = str_replace_all(str_to_lower(county_id), "[[:punct:][:space:]]", "")) %>% 
  mutate(region_name = str_replace_all(str_to_lower(region_name), "[[:punct:][:space:]]", "")) %>% 
  mutate(county_id = str_replace_all(county_id, c("keiyomarakwet" = "elgeyomarakwet", 
                                            "tharaka" = "tharakanithi")))

# read coordinates of kmeans targets used in FB ads
ab_targets <- read.csv("data sets/afrobarometer_kmeans_targets.csv")

# Convert to an sf object, specifying the latitude and longitude columns as coordinates
ab_targets <- st_as_sf(ab_targets, coords = c("centroid_x_joint", "centroid_y_joint"), crs = 4326)  # 4326 is the EPSG code for WGS84

# create a 12 mi ~ 19312 meter radius around each centroid (this is how FB ads were targeted)
buffer_radius <- round(12 * 1609.32)
ab_targets_buffer <- st_buffer(ab_targets, dist = buffer_radius)

# get regions for radii used for ad targeting
# Ensure both layers are using the same Coordinate Reference System (CRS)
ab_targets_buffer <- st_transform(ab_targets_buffer, st_crs(regions_shp))

# perform a spatial intersection to identify regions that these buffers overlap
regions_touched <- st_intersection(ab_targets_buffer, regions_shp)

# create dataframe that drops geometry
regions_touched_df <- regions_touched %>%
  st_drop_geometry()

# Group the data by 'kmeans_joint' and create a concatenated string of regions for each kmeans_joint
regions_wide <- regions_touched_df %>%
  group_by(kmeans_joint) %>%
  summarize(regions = paste(unique(name_1), collapse = ", "), .groups = "drop") %>% 
  mutate(regions = str_replace_all(tolower(regions), "[- ]", ""))

# define regions
regions <- county_region$region_name


# Create new columns for region and kmeans_number from survey data
fb <- fb %>%
  mutate(
    # Extract region if it matches one of the specified regions, otherwise NA
    ad_region = ifelse(str_extract(location, paste(regions, collapse = "|")) %in% regions,
                    str_extract(location, paste(regions, collapse = "|")),
                    NA),
    # Extract the kmeans number if it matches the "kmeans_" pattern, otherwise NA
    kmeans_number = ifelse(str_detect(location, "kmeans_"),
                           str_extract(location, "kmeans_\\d+"),
                           NA)
  )

# Convert kmeans_number to numeric for joining
fb <- fb %>%
  mutate(kmeans_number = as.numeric(str_replace(kmeans_number, "kmeans_", "")))

# map kmeans_number to regions using regions_wide data

# Join fb with kmeans_region to get the region associated with each kmeans_number
fb <- fb %>%
  left_join(regions_wide, by = c("kmeans_number" = "kmeans_joint"))


## AD REGION == "ad_region" -- if that is NA then ad_region == "regions" - from kmeans targeting ##

# using county they say they live in "county" -- compare match btw self-reported county-region and ad-targeted region
fb <- fb %>% 
  mutate(county = county %>%
           str_remove("(?i)county") %>%             # Remove the word "county" (case-insensitive)
           str_to_lower() %>%                       # Convert to lowercase
           str_replace_all("[[:punct:][:space:]]", "")  # Remove all punctuation and spaces
  )

# Join fb with county_region to map self-reported county to its region
fb <- fb %>%
  left_join(county_region, by = c("county" = "county_id")) %>%
  rename(self_region = region_name)  # Rename region_name to self_region

# create binary variable: 1==self-region matches ad_region or is in regions, 0 otherwise
fb <- fb %>%
  mutate(
    region_match = ifelse(
     (self_region == ad_region & !is.na(ad_region)) |
       (!is.na(regions) & str_detect(regions, fixed(self_region))), 1, 0)
  )

sum(fb$region_match)/nrow(fb) #73%



# PREDICTORS OF NON-RESPONSE: FIGURE 3  ####

## load dataset that includes respondents who didn't finish the survey (who are excluded from analysis above) ##
fb.all <- read.csv("data sets/kenya_facebook_plusnonresponse.csv")

## calculate attrition rate (of the ppl who consented to survey)
table(fb.all$consent) # everyone in dataframe consented

# define attritors as finished in <5 min OR didn't finish at all 
fb.all$attrit <- ifelse(fb.all$finished==0 | fb.all$finished==1 & as.numeric(fb.all$Duration..in.seconds.)/60<5, 1, 0)

sum(fb.all$attrit)/nrow(fb.all) #25% attrition

## regression of demo vars predicting attrition ##
nonresp.reg <- lm(attrit ~ ad_old + ad_male, data = fb.all)
sumnonresp <- summary(nonresp.reg)
xtable(nonresp.reg)

coefficients_summary <- sumnonresp$coefficients

# Extract the standard error
SE_beta_hat <- coefficients_summary[, "Std. Error"]

# Extract the estimate
beta_hat <- coefficients_summary[, "Estimate"]

# t critical value for 95% CI, you can get this from qt() function
# Degrees of freedom (df) can be extracted from the model
df <- sumnonresp$df[2]
alpha <- 0.05
t_critical <- qt(1 - alpha/2, df)

# Calculate the margin of error
margin_of_error <- t_critical * SE_beta_hat

# Lower and upper bounds
conf.low <- beta_hat - margin_of_error
conf.high <- beta_hat + margin_of_error

att_tab <- as.data.frame(cbind(coefficients_summary, conf.low, conf.high, df))
names(att_tab) <- tolower(names(att_tab))

## data export for figure 3: predictors of non-response #### 
write.csv(att_tab,paste0(data_exports,"fig3_attritionreg_kenya.csv"))

## PYTHON CODE CREATES FIGURE ## 


# T&K EXPERIMENT: TABLE 3 ####

# calculate % of respondents who pick program A (certain) or program B (risky) policy
# treatment = framing as lives saved or lost


## save ##
# Subset data for relevant categories
subset_save <- fb[fb$asian_save %in% c("Program A", "Program B"), ]

# Calculate weighted counts
weighted_save <- with(subset_save, tapply(weight_trimmed, asian_save, sum))

# Calculate proportions
weighted_prop_save <- weighted_save / sum(weighted_save)


## die ##
# Subset data for relevant categories
subset_die <- fb[fb$asian_die %in% c("Program A", "Program B"), ]

# Calculate weighted counts
weighted_die <- with(subset_die, tapply(weight_trimmed, asian_die, sum))

# Calculate proportions
weighted_prop_die <- weighted_die / sum(weighted_die)



#      Creating data table for PUBLIC OPINION figure: FIGURE 4  ####


att.vars <- c("protest",
              "raise",
              "attendmtg",
              "voted",
              "closeparty",
              "approvepres",
              "freetosay")

## 1. AB data ####

# protested - 11C
ab$protest <- ifelse(ab$Q11C %in% c(2,3,4),1,
                     ifelse(ab$Q11C %in% c(8,9),NA,0))

# contacted govt official ??
# removed this one bc not asked on AB 2019 survey

# raise issue - 11B
ab$raise <- ifelse(ab$Q11B %in% c(2,3,4),1,
                   ifelse(ab$Q11B %in% c(8,9),NA,0))

# attend mtg - 11A
ab$attendmtg <- ifelse(ab$Q11A %in% c(2,3,4),1,
                       ifelse(ab$Q11A %in% c(8,9),NA,0))

# voted - 13
ab$voted <- ifelse(ab$Q13 == 3,1,
                   ifelse(ab$Q13 %in% c(1,2,9),NA,0))

# close to pol party - 91A 
ab$closeparty <- ifelse(ab$Q91A == 1,1,
                        ifelse(ab$Q91A %in% c(-1,8,9),NA,0))

# approve of pres - 51A **same wording for Q but different answer options - may need to drop and replace w/another Q**
ab$approvepres <- ifelse(ab$Q51A %in% c("4","3"),1,
                         ifelse(ab$Q51A %in% c(8,9),NA,0))
## **** not a great option bc we added "neither approve nor disapprove" option that was not in AB ********

# free to speak (say what you think) - 10A 
# code as binary
ab$freetosay <- ifelse(ab$Q10A %in% c(3,4),1, #3-somewhat, 4 - completely
                       ifelse(ab$Q10A %in% c(9),NA,0))


## 2. FB survey data ####

fb <- fb %>% mutate(
  protest = ifelse(engage_diss_protest %in% c("Yes, often","Yes, several times","Yes, once or twice"),1,
                   ifelse(engage_diss_protest == -99,NA,0)),
  raise = ifelse(engage_activity_issu %in% c("Yes, often","Yes, several times","Yes, once or twice"),1,
                 ifelse(engage_activity_issu == -99,NA,0)),
  attendmtg = ifelse(engage_activity_mtg %in% c("Yes, often","Yes, several times","Yes, once or twice"),1,
                     ifelse(engage_activity_mtg == -99,NA,0)),
  voted = ifelse(voted == "I voted",1,ifelse(voted == -99, NA, 0)),
  closeparty = ifelse(polparty == "Yes",1,0),
  approvepres = ifelse(pol_pres %in% c("Somewhat approve","Strongly approve"),1,
                       ifelse(pol_pres == -99,NA,0)),
  freetosay = ifelse(pol_free %in% c("Somewhat free","Completely free"),1,0)
)


# fb attitude table
fb.att.weighted <- calculate_weighted_stats(fb, att.vars, "weight_trimmed")
fb.att.weighted$sample <- "Facebook Sample (weighted)"
names(fb.att.weighted)

fb.att <- fb %>% 
  select(all_of(att.vars)) %>% 
  map_df(~ tibble(
    mean = my.mean(.x), 
    se = my.se(.x),
    ci.low = mean - 1.96*se,
    ci.high = mean + 1.96*se
  ), .id = "variable")

fb.att$sample <- "Facebook Sample"


# calculate ab demographic mean/se
ab.att <- ab %>% 
  select(all_of(att.vars)) %>% 
  map_df(~ tibble(
    mean = my.mean(.x), 
    se = my.se(.x), 
    ci.low = mean - 1.96*se,
    ci.high = mean + 1.96*se
  ), .id = "variable")

ab.att$sample <- "Afrobarometer"

##  calculate weighted average for Afrobarometer data ##
ab.att.weighted <- calculate_weighted_stats(data = ab, var_names = att.vars, weight_col = "withinwt_hh", ids = ~1)
ab.att.weighted$sample <- "Afrobatometer (weighted)"

att.all <- rbind(fb.att,fb.att.weighted, ab.att,ab.att.weighted)

## output data underlying figure 4: political behavior. #### 
## figure is created in Python script
write_csv(att.all,paste0(data_exports,"fig4_behavior_kenya.csv"))

# APPENDIX for Kenya CODE BELOW ####
## TABLE S1 kenya
adclick <- read.csv("data sets/kenya_facebook_plusnonresponse.csv")
1-summary(adclick$ad_male)
adclick$ad_female <- 1-adclick$ad_male
mean <- my.mean(adclick$ad_female)
se <- my.se(adclick$ad_female)
# create 1 row table with mean, se, ci.low, ci.high
adtab <- c(mean, se, mean-1.96*se, mean+1.96*se)
names(adtab) <- c("mean","se","ci.low","ci.high")
adtab <- as.data.frame(t(adtab))

adtab <- adtab %>%
  mutate(
    variable = "ad_female",  # Add the 'variable' column
    sample = "Facebook ad clickers"      # Add the 'sample' column
  )

# Reorder columns so 'variable' comes first and 'sample' last
adtab <- adtab %>%
  select(variable, everything(), sample)

write.csv(adtab,file=paste0(data_exports,"figs1_kenya_adclicker_tab.csv"))
# for Figure S1 see python code

## ATTENTION CHECKS

# age attention check
# while it could be that ppl just weren't paying attention anfb clicking/entering random age
# all of these ppl also passefb the age attention check question!
fbage <- fb[fb$age!=-99 & fb$age_attn_check_v1!=-99,]
# among those who answered both age questions...

fbage$passefb_age_attn <- ifelse(fbage$age==fbage$age_attn_check_v1,1,0)
table(fbage$passefb_age_attn)/nrow(fbage)
# among those who answered both age questions - 98%

table(fbage$passefb_age_attn)/nrow(fb)
# among all respondents -- 98% passed

# how many ppl didn't answer age follow up question:
sum(fb$age_attn_check_v1==-99) #4 didn't answer second age Q

# democracy attention check
# attn check - if put "k" in other box
table(fb$attn_chk1_12) #833 wrote something in the "other" text box
table(fb$attn_chk1_12_TEXT) #-99 if they didnt answer the text option, which is required to pass

# passed if have "k" in text
fb <- fb %>%
  mutate(attn_k_pass = ifelse(str_detect(attn_chk1_12_TEXT, "(?i)\\s*k+\\s*"), 1, 0))

table(fb$attn_k_pass)/dim(fb)[1] #54% passed attention check


# but maybe ppl just skipped the question completely?
fb$skipattn1 <- ifelse(fb$attn_chk1_1==-99 & fb$attn_chk1_2==-99 & fb$attn_chk1_3==-99 &
                         fb$attn_chk1_4==-99 & fb$attn_chk1_5==-99 & fb$attn_chk1_6==-99 &
                         fb$attn_chk1_7==-99 & fb$attn_chk1_8==-99 & fb$attn_chk1_9==-99 &  
                         fb$attn_chk1_10==-99 & fb$attn_chk1_11==-99 & fb$attn_chk1_12==-99,1,0)
table(fb$skipattn1)                      
# only 2 ppl didnt see/skipped the question

## FIGURE S4: see python code #

## TABLE S4: Reach of survey for kenya

# load fb ads dataset
ads <- read.csv("data sets/Kenya_ad_sets_dataframe.csv")

# calculate metrics for Kenya column
kenya_summary <- ads %>%
  summarise(
    Impressions = sum(Impressions, na.rm = TRUE),
    Reach = sum(Reach, na.rm = TRUE),
    Clicks = sum(Link.Clicks, na.rm = TRUE),
    Survey_results = nrow(fb),
    Total_spent = sum(Amount.Spent..USD., na.rm = TRUE),
    Click_through_rate = Clicks / Impressions,
    Completion_rate = Survey_results / Impressions,
    Cost_per_click = Total_spent / Clicks,
    Cost_per_completed_survey = Total_spent / Survey_results
  )

print(kenya_summary)


## TABLE S5: cost of Facebook targeting (median, min, max) for kenya

# Calculate individual metrics and then summarise
kenya_metrics <- ads %>%
  mutate(
    Click_through_rate = (Link.Clicks / Impressions) * 100,
    Completion_rate = (Results / Impressions) * 100,
    Cost_per_click = Amount.Spent..USD. / Link.Clicks,
    Cost_per_completed_survey = Amount.Spent..USD. / Results
  ) %>%
  summarise(
    CTR_Median = median(Click_through_rate, na.rm = TRUE),
    CTR_Max = max(Click_through_rate, na.rm = TRUE),
    CTR_Min = min(Click_through_rate, na.rm = TRUE),
    
    Completion_Median = median(Completion_rate, na.rm = TRUE),
    Completion_Max = max(Completion_rate, na.rm = TRUE),
    Completion_Min = min(Completion_rate, na.rm = TRUE),
    
    CPC_Median = median(Cost_per_click, na.rm = TRUE),
    CPC_Max = max(Cost_per_click, na.rm = TRUE),
    CPC_Min = min(Cost_per_click, na.rm = TRUE),
    
    CPS_Median = median(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Max = max(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Min = min(Cost_per_completed_survey, na.rm = TRUE)
  )

print(round(kenya_metrics,2))


## TABLE S6: Reach of surveys, by education for kenya

# Define Oversample and Initial based on Ad Set Name
ads <- ads %>%
  mutate(Sample_Type = ifelse(grepl("_eduna|_old", Ad.Set.Name), "Oversample", "Initial"))

# Summarize metrics for Kenya Initial sample
kenya_initial <- ads %>%
  filter(Sample_Type == "Initial") %>%
  summarise(
    Reach = sum(Reach, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Clicks = sum(Link.Clicks, na.rm = TRUE),
    Survey_results = sum(Results, na.rm = TRUE),
    Total_spent = sum(Amount.Spent..USD., na.rm = TRUE)
  )

# Summarize metrics for Kenya Oversample
kenya_oversample <- ads %>%
  filter(Sample_Type == "Oversample") %>%
  summarise(
    Reach = sum(Reach, na.rm = TRUE),
    Impressions = sum(Impressions, na.rm = TRUE),
    Clicks = sum(Link.Clicks, na.rm = TRUE),
    Survey_results = sum(Results, na.rm = TRUE),
    Total_spent = sum(Amount.Spent..USD., na.rm = TRUE)
  )

# Display results
print(kenya_initial)
print(kenya_oversample)


## TABLE S7: Cost of Facebook targeting with and without low education oversample

# Calculate metrics for Kenya Initial sample
kenya_initial_metrics <- ads %>%
  filter(Sample_Type == "Initial") %>%
  mutate(
    Click_through_rate = (Link.Clicks / Impressions) * 100,
    Completion_rate = (Results / Impressions) * 100,
    Cost_per_click = Amount.Spent..USD. / Link.Clicks,
    Cost_per_completed_survey = Amount.Spent..USD. / Results
  ) %>%
  summarise(
    CTR_Median = median(Click_through_rate, na.rm = TRUE),
    CTR_Max = max(Click_through_rate, na.rm = TRUE),
    CTR_Min = min(Click_through_rate, na.rm = TRUE),
    
    Completion_Median = median(Completion_rate, na.rm = TRUE),
    Completion_Max = max(Completion_rate, na.rm = TRUE),
    Completion_Min = min(Completion_rate, na.rm = TRUE),
    
    CPC_Median = median(Cost_per_click, na.rm = TRUE),
    CPC_Max = max(Cost_per_click, na.rm = TRUE),
    CPC_Min = min(Cost_per_click, na.rm = TRUE),
    
    CPS_Median = median(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Max = max(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Min = min(Cost_per_completed_survey, na.rm = TRUE)
  )

# Calculate metrics for Kenya Oversample
kenya_oversample_metrics <- ads %>%
  filter(Sample_Type == "Oversample") %>%
  mutate(
    Click_through_rate = (Link.Clicks / Impressions) * 100,
    Completion_rate = (Results / Impressions) * 100,
    Cost_per_click = Amount.Spent..USD. / Link.Clicks,
    Cost_per_completed_survey = Amount.Spent..USD. / Results
  ) %>%
  summarise(
    CTR_Median = median(Click_through_rate, na.rm = TRUE),
    CTR_Max = max(Click_through_rate, na.rm = TRUE),
    CTR_Min = min(Click_through_rate, na.rm = TRUE),
    
    Completion_Median = median(Completion_rate, na.rm = TRUE),
    Completion_Max = max(Completion_rate, na.rm = TRUE),
    Completion_Min = min(Completion_rate, na.rm = TRUE),
    
    CPC_Median = median(Cost_per_click, na.rm = TRUE),
    CPC_Max = max(Cost_per_click, na.rm = TRUE),
    CPC_Min = min(Cost_per_click, na.rm = TRUE),
    
    CPS_Median = median(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Max = max(Cost_per_completed_survey, na.rm = TRUE),
    CPS_Min = min(Cost_per_completed_survey, na.rm = TRUE)
  )

# Display results
print(kenya_initial_metrics)
print(kenya_oversample_metrics)

## TABLE S8: cost per response by respondent type

# # Load the datasets
afrobarometer_kmeans_data <- read.csv("data sets/afrobarometer_kmeans_targets.csv")
ads <- read.csv("data sets/Kenya_ad_sets_dataframe.csv")

# Step 1: Aggregate to classify each kmeans_joint cluster as Urban or Rural
kmeans_classification <- afrobarometer_kmeans_data %>%
  filter(urban != "both") %>%  # Ignore sites classified as "both"
  group_by(kmeans_joint) %>%
  summarise(
    urban_count = sum(urban == "urban"),
    rural_count = sum(urban == "rural")
  ) %>%
  mutate(
    Location = case_when(
      urban_count >= rural_count ~ "Urban", # clusters that have equal number of urban and rural --> urban
      rural_count > urban_count ~ "Rural"
    )
  ) %>%
  select(kmeans_joint, Location)

# Step 2: Add gender and location information to ads
ads <- ads %>%
  mutate(
    Gender = case_when(
      grepl("_f", Ad.Set.Name) ~ "Female",
      grepl("_m", Ad.Set.Name) ~ "Male",
      TRUE ~ NA
    ),
      kmeans_joint = as.integer(str_extract(Ad.Set.Name, "(?<=^kmeans_)\\d+"))
  ) %>%
left_join(kmeans_classification, by = "kmeans_joint")

# Step 3: Calculate total spend for each type: overall, gender, and location categories
kenya_cost_per_type <- ads %>%
    # Calculate cost per response for each observation
  summarise(
    Overall = sum(Amount.Spent..USD., na.rm = TRUE),
    Men = sum(Amount.Spent..USD.[Gender == "Male"], na.rm = TRUE),
    Women = sum(Amount.Spent..USD.[Gender == "Female"], na.rm = TRUE),
    Urban = sum(Amount.Spent..USD.[Location == "Urban"], na.rm = TRUE),
    Rural = sum(Amount.Spent..USD.[Location == "Rural"], na.rm = TRUE)
  )

# divide each by number of respondents who completed survey in each category (as self-reported in survey data)
kenya_cost_per_type/c(nrow(fb),sum(fb$female==0),sum(fb$female==1),sum(fb$urban==1),sum(fb$urban==0))


